In today's market, many companies have a mobile presence. Often these companies provide free products/services in their mobile apps in an attempt to transition their customers to a paid membership. Some examples of paid products that originate from free ones are YouTube Red, Pandora Premium, Netflix, Disney+ Hotstar, Amazon Prime, Audible subscriptions, and Spotify Premium. Since marketing efforts are never free, these companies need to know exactly whom to target with offers and promotions.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from dateutil import parser
# Load the raw app-usage data (one row per user's first session).
dataset = pd.read_csv('appdata10.csv')
# Quick sanity check of the first few rows.
dataset.head()
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 2012-12-27 02:14:51.273 | 3 | 02:00:00 | 23 | idscreen,joinscreen,Cycle,product_review,ScanP... | 15 | 0 | 0 | 0 | NaN | 0 |
| 1 | 333588 | 2012-12-02 01:16:00.905 | 6 | 01:00:00 | 24 | joinscreen,product_review,product_review2,Scan... | 13 | 0 | 0 | 0 | NaN | 0 |
| 2 | 254414 | 2013-03-19 19:19:09.157 | 1 | 19:00:00 | 23 | Splash,Cycle,Loan | 3 | 0 | 1 | 0 | NaN | 1 |
| 3 | 234192 | 2013-07-05 16:08:46.354 | 4 | 16:00:00 | 28 | product_review,Home,product_review,Loan3,Finan... | 40 | 0 | 0 | 1 | 2013-07-05 16:11:49.513 | 0 |
| 4 | 51549 | 2013-02-26 18:50:48.661 | 1 | 18:00:00 | 31 | idscreen,joinscreen,Cycle,Credit3Container,Sca... | 32 | 0 | 0 | 1 | 2013-02-26 18:56:37.841 | 1 |
# Count missing values per column -- only `enrolled_date` has NaNs
# (users who never enrolled have no enrollment timestamp).
dataset.isnull().sum()
user 0 first_open 0 dayofweek 0 hour 0 age 0 screen_list 0 numscreens 0 minigame 0 used_premium_feature 0 enrolled 0 enrolled_date 18926 liked 0 dtype: int64
# Summary statistics of the numeric columns; `hour` is still a string
# here so it does not appear in this table.
dataset.describe()
| user | dayofweek | age | numscreens | minigame | used_premium_feature | enrolled | liked | |
|---|---|---|---|---|---|---|---|---|
| count | 50000.000000 | 50000.000000 | 50000.00000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
| mean | 186889.729900 | 3.029860 | 31.72436 | 21.095900 | 0.107820 | 0.172020 | 0.621480 | 0.165000 |
| std | 107768.520361 | 2.031997 | 10.80331 | 15.728812 | 0.310156 | 0.377402 | 0.485023 | 0.371184 |
| min | 13.000000 | 0.000000 | 16.00000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 93526.750000 | 1.000000 | 24.00000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 187193.500000 | 3.000000 | 29.00000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 75% | 279984.250000 | 5.000000 | 37.00000 | 28.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 373662.000000 | 6.000000 | 101.00000 | 325.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# The raw `hour` column is a time-of-day string (e.g. " 02:00:00").
# Keep only the hour digits and convert to int.  Stripping whitespace
# first makes this robust whether or not the export carries a leading
# space -- the original slice(1, 3) silently relied on that space and
# would produce strings like "9:" without it.
dataset['hour'] = dataset['hour'].str.strip().str.slice(0, 2).astype(int)
dataset.head()
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 2012-12-27 02:14:51.273 | 3 | 2 | 23 | idscreen,joinscreen,Cycle,product_review,ScanP... | 15 | 0 | 0 | 0 | NaN | 0 |
| 1 | 333588 | 2012-12-02 01:16:00.905 | 6 | 1 | 24 | joinscreen,product_review,product_review2,Scan... | 13 | 0 | 0 | 0 | NaN | 0 |
| 2 | 254414 | 2013-03-19 19:19:09.157 | 1 | 19 | 23 | Splash,Cycle,Loan | 3 | 0 | 1 | 0 | NaN | 1 |
| 3 | 234192 | 2013-07-05 16:08:46.354 | 4 | 16 | 28 | product_review,Home,product_review,Loan3,Finan... | 40 | 0 | 0 | 1 | 2013-07-05 16:11:49.513 | 0 |
| 4 | 51549 | 2013-02-26 18:50:48.661 | 1 | 18 | 31 | idscreen,joinscreen,Cycle,Credit3Container,Sca... | 32 | 0 | 0 | 1 | 2013-02-26 18:56:37.841 | 1 |
# Re-run the summary now that `hour` is numeric -- it should appear in
# the table with values in [0, 23].
dataset.describe()
| user | dayofweek | hour | age | numscreens | minigame | used_premium_feature | enrolled | liked | |
|---|---|---|---|---|---|---|---|---|---|
| count | 50000.000000 | 50000.000000 | 50000.000000 | 50000.00000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 | 50000.000000 |
| mean | 186889.729900 | 3.029860 | 12.557220 | 31.72436 | 21.095900 | 0.107820 | 0.172020 | 0.621480 | 0.165000 |
| std | 107768.520361 | 2.031997 | 7.438072 | 10.80331 | 15.728812 | 0.310156 | 0.377402 | 0.485023 | 0.371184 |
| min | 13.000000 | 0.000000 | 0.000000 | 16.00000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 93526.750000 | 1.000000 | 5.000000 | 24.00000 | 10.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 187193.500000 | 3.000000 | 14.000000 | 29.00000 | 18.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| 75% | 279984.250000 | 5.000000 | 19.000000 | 37.00000 | 28.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 |
| max | 373662.000000 | 6.000000 | 23.000000 | 101.00000 | 325.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
# Working copy for EDA: keep only the numeric feature columns.
# Identifiers, raw timestamps, the screen-text blob and the target are
# excluded so the plots below cover features only.
_non_feature_cols = ['user', 'screen_list', 'enrolled_date', 'first_open', 'enrolled']
dataset_copy = dataset.drop(columns=_non_feature_cols).copy()
dataset_copy.head()
| dayofweek | hour | age | numscreens | minigame | used_premium_feature | liked | |
|---|---|---|---|---|---|---|---|
| 0 | 3 | 2 | 23 | 15 | 0 | 0 | 0 |
| 1 | 6 | 1 | 24 | 13 | 0 | 0 | 0 |
| 2 | 1 | 19 | 23 | 3 | 0 | 1 | 1 |
| 3 | 4 | 16 | 28 | 40 | 0 | 0 | 0 |
| 4 | 1 | 18 | 31 | 32 | 0 | 0 | 1 |
# Histogram of every numeric feature, one subplot each.  Using one bin
# per distinct value makes the discrete columns (dayofweek, the binary
# flags) render cleanly.
plt.figure(figsize=(15, 12))
plt.suptitle('Histograms of Numerical Columns', fontsize = 20)
for i, col in enumerate(dataset_copy.columns, start=1):
    plt.subplot(3, 3, i)
    plt.gca().set_title(col)
    # One bin per unique value (no NaNs in these columns, so nunique()
    # matches the original np.size(unique()) count exactly).
    bins = dataset_copy[col].nunique()
    plt.hist(dataset_copy[col], bins = bins, color = '#3F5D7D')
#plt.savefig('Histograms of Numerical Columns.png')
# Heatmap Using Seaborn
# numeric_only=True keeps the non-numeric columns (first_open,
# screen_list, enrolled_date) out of .corr() -- without it this call
# raises on pandas >= 2.0.
plt.figure(figsize = (15, 10))
sns.heatmap(dataset.corr(numeric_only=True), annot = True, cmap = 'ocean', linewidths= 1, linecolor = 'black')
#plt.savefig('Heatmap Using Seaborn.png')
<AxesSubplot:>
# Heatmap Using Plotly
# numeric_only=True restricts the correlation matrix to numeric columns
# (object columns would make .corr() raise on pandas >= 2.0).
px.imshow(dataset.corr(numeric_only=True), color_continuous_scale = 'tealrose')
# Exploratory 3D scatter of three raw features, coloured by the target.
px.scatter_3d(dataset, 'age', 'numscreens', 'hour', color = 'enrolled')
# Inspect dtypes: the two timestamp columns are still plain strings
# (object) at this point and need parsing before date arithmetic.
dataset.dtypes
user int64 first_open object dayofweek int64 hour int32 age int64 screen_list object numscreens int64 minigame int64 used_premium_feature int64 enrolled int64 enrolled_date object liked int64 dtype: object
# Parse the timestamp strings into datetime64 columns.  pd.to_datetime
# is vectorized (far faster than per-row dateutil parsing) and maps the
# float NaN entries in enrolled_date straight to NaT, so no isinstance
# guard is needed.
dataset['first_open'] = pd.to_datetime(dataset['first_open'])
dataset['enrolled_date'] = pd.to_datetime(dataset['enrolled_date'])
dataset.dtypes
user int64 first_open datetime64[ns] dayofweek int64 hour int32 age int64 screen_list object numscreens int64 minigame int64 used_premium_feature int64 enrolled int64 enrolled_date datetime64[ns] liked int64 dtype: object
# Whole hours between first app open and enrollment (NaT -> NaN for
# users who never enrolled).  Floor division by a 1-hour Timedelta
# replaces astype('timedelta64[h]'), which was removed in pandas 2.0;
# for these non-negative deltas the floored result matches the old
# truncation exactly.
dataset['difference'] = (dataset.enrolled_date - dataset.first_open) // pd.Timedelta(hours=1)
dataset.head(20)
| user | first_open | dayofweek | hour | age | screen_list | numscreens | minigame | used_premium_feature | enrolled | enrolled_date | liked | difference | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 2012-12-27 02:14:51.273 | 3 | 2 | 23 | idscreen,joinscreen,Cycle,product_review,ScanP... | 15 | 0 | 0 | 0 | NaT | 0 | NaN |
| 1 | 333588 | 2012-12-02 01:16:00.905 | 6 | 1 | 24 | joinscreen,product_review,product_review2,Scan... | 13 | 0 | 0 | 0 | NaT | 0 | NaN |
| 2 | 254414 | 2013-03-19 19:19:09.157 | 1 | 19 | 23 | Splash,Cycle,Loan | 3 | 0 | 1 | 0 | NaT | 1 | NaN |
| 3 | 234192 | 2013-07-05 16:08:46.354 | 4 | 16 | 28 | product_review,Home,product_review,Loan3,Finan... | 40 | 0 | 0 | 1 | 2013-07-05 16:11:49.513 | 0 | 0.0 |
| 4 | 51549 | 2013-02-26 18:50:48.661 | 1 | 18 | 31 | idscreen,joinscreen,Cycle,Credit3Container,Sca... | 32 | 0 | 0 | 1 | 2013-02-26 18:56:37.841 | 1 | 0.0 |
| 5 | 56480 | 2013-04-03 09:58:15.752 | 2 | 9 | 20 | idscreen,Cycle,Home,ScanPreview,VerifyPhone,Ve... | 14 | 0 | 0 | 1 | 2013-04-03 09:59:03.291 | 0 | 0.0 |
| 6 | 144649 | 2012-12-25 02:33:18.461 | 1 | 2 | 35 | product_review,product_review2,ScanPreview | 3 | 0 | 0 | 0 | NaT | 0 | NaN |
| 7 | 249366 | 2012-12-11 03:07:49.875 | 1 | 3 | 26 | Splash,Cycle,Home,Credit3Container,Credit3Dash... | 41 | 0 | 1 | 0 | NaT | 0 | NaN |
| 8 | 372004 | 2013-03-20 14:22:01.569 | 2 | 14 | 29 | product_review,product_review2,ScanPreview,Ver... | 33 | 1 | 1 | 1 | 2013-04-27 22:24:54.542 | 0 | 920.0 |
| 9 | 338013 | 2013-04-26 18:22:16.013 | 4 | 18 | 26 | Home,Loan2,product_review,product_review,produ... | 19 | 0 | 0 | 1 | 2013-04-26 18:31:58.923 | 0 | 0.0 |
| 10 | 43555 | 2013-05-14 04:48:27.597 | 1 | 4 | 39 | Splash,idscreen,Home,RewardsContainer,Settings... | 14 | 0 | 0 | 1 | 2013-05-15 21:02:17.200 | 0 | 40.0 |
| 11 | 317454 | 2013-05-28 11:07:07.358 | 1 | 11 | 32 | product_review,Home,Loan2,Credit3Container,Ver... | 25 | 1 | 1 | 0 | NaT | 0 | NaN |
| 12 | 205375 | 2012-12-17 06:28:45.903 | 0 | 6 | 25 | idscreen,joinscreen,Cycle,product_review,produ... | 11 | 0 | 0 | 0 | NaT | 0 | NaN |
| 13 | 307608 | 2013-05-25 19:52:31.798 | 5 | 19 | 23 | Alerts,ProfilePage,Home,Credit3Container | 4 | 0 | 0 | 1 | 2013-06-18 14:27:42.824 | 0 | 570.0 |
| 14 | 359855 | 2013-02-18 04:48:48.912 | 0 | 4 | 17 | joinscreen,product_review,product_review2,Scan... | 9 | 0 | 0 | 0 | NaT | 0 | NaN |
| 15 | 284938 | 2013-02-02 18:41:35.724 | 5 | 18 | 25 | idscreen,joinscreen,Cycle,Loan2,product_review... | 26 | 1 | 0 | 1 | 2013-04-29 21:10:04.466 | 0 | 2066.0 |
| 16 | 235143 | 2013-07-07 16:07:35.057 | 6 | 16 | 21 | product_review,product_review,product_review,p... | 6 | 0 | 0 | 1 | 2013-07-08 16:24:09.052 | 0 | 24.0 |
| 17 | 141402 | 2013-02-02 21:12:46.888 | 5 | 21 | 55 | joinscreen,Cycle,product_review,Loan2,product_... | 20 | 0 | 0 | 1 | 2013-02-11 01:35:03.098 | 0 | 196.0 |
| 18 | 257945 | 2013-05-10 05:59:43.405 | 4 | 5 | 32 | Splash,product_review,Home,Loan2,product_revie... | 15 | 0 | 0 | 1 | 2013-05-11 04:29:36.906 | 1 | 22.0 |
| 19 | 54931 | 2013-07-06 17:34:46.439 | 5 | 17 | 25 | idscreen,Loan3,product_review,product_review,Home | 5 | 0 | 0 | 1 | 2013-07-06 18:55:54.215 | 0 | 1.0 |
# How quickly do enrolled users convert after first opening the app?
# Most conversions happen within the first few hours, so 48h is a
# reasonable cutoff window to visualize.
enroll_delay = dataset['difference'].dropna()
plt.hist(enroll_delay, range = [0, 48])
plt.title('Distribution of Time-Since-Enrolled')
plt.show()
#plt.savefig('Distribution of Time Since Enrolled.png')
# First we remove every user who took more than 48 hours to enroll (mark them as 0).
# Next we remove some columns which no longer serve the purpose
dataset.loc[dataset.difference > 48, 'enrolled'] = 0
dataset = dataset.drop(columns = ['difference', 'enrolled_date', 'first_open'])

# One-hot encode the most popular screens.  A trailing comma is appended
# so every screen name in the list ends with ',' and we can test for the
# literal token 'name,'.  The original str.contains(sc) did plain
# substring matching and wrongly flagged e.g. 'Credit3' for users who
# only visited 'Credit3Container'.  regex=False treats the name
# literally (no regex metacharacter surprises) on both calls.
top_screens = pd.read_csv('top_screens.csv').top_screens.values
dataset['screen_list'] = dataset['screen_list'].astype(str) + ','
for sc in top_screens:
    dataset[sc] = dataset['screen_list'].str.contains(sc + ',', regex=False).astype(int)
    dataset['screen_list'] = dataset['screen_list'].str.replace(sc + ',', '', regex=False)
# Whatever remains in screen_list are rare screens; count them as 'other'.
dataset['other'] = dataset['screen_list'].str.count(',')
dataset = dataset.drop(columns = ['screen_list'])
# Screens belonging to the same in-app funnel are highly correlated with
# each other, so collapse each funnel into a single visit-count feature
# and drop the individual indicator columns.
funnels = {
    'SavingsCount': ['Saving1', 'Saving2', 'Saving2Amount', 'Saving4', 'Saving5',
                     'Saving6', 'Saving7', 'Saving8', 'Saving9', 'Saving10'],
    'CMCount': ['Credit1', 'Credit2', 'Credit3', 'Credit3Container', 'Credit3Dashboard'],
    'CCCount': ['CC1', 'CC1Category', 'CC3'],
    'LoansCount': ['Loan', 'Loan2', 'Loan3', 'Loan4'],
}
for count_col, screens in funnels.items():
    dataset[count_col] = dataset[screens].sum(axis = 1)
    dataset = dataset.drop(columns = screens)
dataset.head()
| user | dayofweek | hour | age | numscreens | minigame | used_premium_feature | enrolled | liked | location | ... | SecurityModal | ResendToken | TransactionList | NetworkFailure | ListPicker | other | SavingsCount | CMCount | CCCount | LoansCount | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 235136 | 3 | 2 | 23 | 15 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 7 | 0 | 0 | 0 | 1 |
| 1 | 333588 | 6 | 1 | 24 | 13 | 0 | 0 | 0 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 5 | 0 | 0 | 0 | 1 |
| 2 | 254414 | 1 | 19 | 23 | 3 | 0 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 3 | 234192 | 4 | 16 | 28 | 40 | 0 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 0 | 6 | 0 | 3 | 0 | 1 |
| 4 | 51549 | 1 | 18 | 31 | 32 | 0 | 0 | 1 | 1 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 10 | 0 | 2 | 0 | 1 |
5 rows × 50 columns
# Final feature set after screen encoding and funnel aggregation.
dataset.columns
Index(['user', 'dayofweek', 'hour', 'age', 'numscreens', 'minigame',
'used_premium_feature', 'enrolled', 'liked', 'location', 'Institutions',
'VerifyPhone', 'BankVerification', 'VerifyDateOfBirth', 'ProfilePage',
'VerifyCountry', 'Cycle', 'idscreen', 'Splash', 'RewardsContainer',
'EditProfile', 'Finances', 'Alerts', 'Leaderboard', 'VerifyMobile',
'VerifyHousing', 'RewardDetail', 'VerifyHousingAmount',
'ProfileMaritalStatus', 'ProfileChildren ', 'ProfileEducation',
'ProfileEducationMajor', 'Rewards', 'AccountView', 'VerifyAnnualIncome',
'VerifyIncomeType', 'ProfileJobTitle', 'Login',
'ProfileEmploymentLength', 'WebView', 'SecurityModal', 'ResendToken',
'TransactionList', 'NetworkFailure', 'ListPicker', 'other',
'SavingsCount', 'CMCount', 'CCCount', 'LoansCount'],
dtype='object')
# Persist the engineered features, then reload so modelling starts from
# the saved artifact (the CSV round-trip also normalizes dtypes/index).
dataset.to_csv('new_appdata10.csv', index = False)
dataset = pd.read_csv('new_appdata10.csv')
# Separate the target from the feature matrix.
response = dataset['enrolled']
dataset = dataset.drop(columns = 'enrolled')
from sklearn.model_selection import train_test_split

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    dataset, response, test_size = 0.2, random_state = 0
)

# The user id is not a model feature, but we set it aside so predictions
# can be mapped back to individual users afterwards.  pop() removes the
# column and returns it in one step.
train_identifier = X_train.pop('user')
test_identifier = X_test.pop('user')
from sklearn.preprocessing import StandardScaler

# Standardize the features: fit on the training set only, then apply the
# same transform to the test set.  The scaler returns bare ndarrays, so
# rebuild DataFrames carrying the original column names and row index.
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train),
                       columns=X_train.columns.values,
                       index=X_train.index.values)
X_test = pd.DataFrame(sc.transform(X_test),
                      columns=X_test.columns.values,
                      index=X_test.index.values)
from sklearn.linear_model import LogisticRegression

# L1-regularized logistic regression; liblinear is one of the solvers
# that supports the l1 penalty.
classifier = LogisticRegression(penalty = 'l1', solver = 'liblinear', random_state = 0)
classifier.fit(X_train, y_train)
LogisticRegression(penalty='l1', random_state=0, solver='liblinear')
# Predict enrollment for the held-out test users.
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, classification_report
# Confusion Matrix & Accuracy Score
cm = confusion_matrix(y_test, y_pred)
# fmt='g' shows raw counts instead of scientific notation.
sns.heatmap(cm, annot = True, fmt = 'g')
accuracy_score(y_test, y_pred)
#plt.savefig('Confusion Matrix.png')
0.7681
# Precision Score: of the users predicted to enroll, the share that did.
precision_score(y_test, y_pred)
0.7618952017667135
# Recall Score: of the users who actually enrolled, the share we caught.
recall_score(y_test, y_pred)
0.7700892857142857
# F1 Score: harmonic mean of precision and recall.
f1_score(y_test, y_pred)
0.7659703300030276
# A full Classification Report: per-class precision/recall/F1 + support.
print(classification_report(y_test, y_pred))
precision recall f1-score support
0 0.77 0.77 0.77 5072
1 0.76 0.77 0.77 4928
accuracy 0.77 10000
macro avg 0.77 0.77 0.77 10000
weighted avg 0.77 0.77 0.77 10000
# K-Fold Cross Validation
from sklearn.model_selection import cross_val_score

# 10-fold CV on the training data to gauge how stable the test accuracy
# is; the reported spread is two standard deviations.
accuracies = cross_val_score(classifier, X_train, y_train, cv = 10)
mean_accuracy = accuracies.mean()
two_sigma = accuracies.std() * 2
print("Logistic Regression Mean Accuracy: %0.3f" % mean_accuracy)
print("Logistic Regression Standard Deviation: %0.3f" % two_sigma)
Logistic Regression Mean Accuracy: 0.767 Logistic Regression Standard Deviation: 0.009
# Join the predictions back onto the user ids (test_identifier shares
# the same index as y_test, so concat aligns them row-for-row).
final_results = pd.concat([y_test, test_identifier], axis = 1)
final_results['predicted_results'] = y_pred
display_cols = ['user', 'enrolled', 'predicted_results']
final_results[display_cols].reset_index(drop = True)
| user | enrolled | predicted_results | |
|---|---|---|---|
| 0 | 239786 | 1 | 1 |
| 1 | 279644 | 1 | 1 |
| 2 | 98290 | 0 | 0 |
| 3 | 170150 | 1 | 1 |
| 4 | 237568 | 1 | 1 |
| ... | ... | ... | ... |
| 9995 | 143036 | 1 | 0 |
| 9996 | 91158 | 1 | 1 |
| 9997 | 248318 | 0 | 0 |
| 9998 | 142418 | 1 | 1 |
| 9999 | 279355 | 1 | 1 |
10000 rows × 3 columns